---
title: "Get BRCA RNA-Seq Data from TCGA and prepare BRCA Subtype dataset"
output: html_document
date: '2023-02-24'
---

```{r setup, include=FALSE}
# Default chunk option for the whole document: show each chunk's source code
# in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library(TCGAbiolinks)
library(SummarizedExperiment)
```
Download breast cancer gene expression data from The Cancer Genome Atlas (TCGA)
through the GDC API provided by the TCGAbiolinks package, and save it to
`brca.rda` as a `SummarizedExperiment` object.
```{r}
# Project bookkeeping. `projects`, `project_files` and `cnt` are not used by
# the download below.
projects <- c("BRCA")
project_files <- c("brca.rda")

cnt <- length(projects)

project_name <- "BRCA"
project_file <- "brca.rda"
projectTCGA <- "TCGA-BRCA"

# Build the GDC query for the project's gene expression quantification files
# produced by the "STAR - Counts" workflow.
query <- GDCquery(
  project = projectTCGA,
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "STAR - Counts"
)

# Fetch the files matched by the query through the GDC API.
GDCdownload(query, method = "api")

# NOTE: `project_name` is rebound here. From this point on it holds the
# prepared SummarizedExperiment rather than the string "BRCA", and it is saved
# to disk under that variable name.
project_name <- GDCprepare(query = query, summarizedExperiment = TRUE)
save(project_name, file = project_file)

```

```{r}
# Build lists of gene IDs per RNA biotype from the row annotation of the
# SummarizedExperiment created by the download chunk.
# load(file = "brca.rda")
BRCA <- project_name

# Row names of the experiment are the gene identifiers shared by every assay.
genes <- rownames(BRCA)
gene_type <- elementMetadata(BRCA)$gene_type

# Index the ID vector directly instead of subsetting the whole expression
# matrix. which() drops NA annotations, and the result stays a character
# vector even when only a single gene matches.
mRNAgenes <- genes[which(gene_type == "protein_coding")]
miRNAgenes <- genes[which(gene_type == "miRNA")]
lncRNAgenes <- genes[which(gene_type == "lncRNA")]

# Everything that is not mRNA, miRNA or lncRNA is grouped as "other".
othRNAgenes <- setdiff(
  genes,
  union(union(mRNAgenes, miRNAgenes), lncRNAgenes)
)

rm(gene_type)
```

```{r}

# Reload the SummarizedExperiment written by the download chunk. load()
# restores it under the name it was saved with, `project_name`.
project_file <- "brca.rda"
load(file = project_file)

# Restrict to primary-tumour samples and pull out the expression matrix
# (`fpkm_unstrand` assay, genes in rows, samples in columns) together with
# the PAM50 subtype annotation of each remaining sample.
is_primary_tumor <- which(
  colData(project_name)$sample_type == "Primary Tumor"
)
tumor <- project_name[, is_primary_tumor]
tumor_fpkm <- assays(tumor)$fpkm_unstrand
tumor_subtype <- colData(tumor)$paper_BRCA_Subtype_PAM50

# Expression columns of the tumour samples annotated with `subtype`.
# which() skips samples with a missing subtype; drop = FALSE keeps a matrix
# even when zero or one sample matches.
subtype_fpkm <- function(subtype) {
  tumor_fpkm[, which(tumor_subtype == subtype), drop = FALSE]
}

bBa <- subtype_fpkm("Basal")
bHe <- subtype_fpkm("Her2")
bLA <- subtype_fpkm("LumA")
bLB <- subtype_fpkm("LumB")
bNo <- subtype_fpkm("Normal")

# Class labels, in the same sample order as the columns of `setnf5`:
# 0 = Basal, 1 = Her2, 2 = LumA, 3 = LumB, 4 = Normal.
labels5 <- rep(
  c(0, 1, 2, 3, 4),
  times = c(ncol(bBa), ncol(bHe), ncol(bLA), ncol(bLB), ncol(bNo))
)

# Combined expression matrix with samples grouped by subtype.
setnf5 <- cbind(bBa, bHe, bLA, bLB, bNo)

# Drop intermediates and the large loaded object.
rm(is_primary_tumor, tumor, tumor_fpkm, tumor_subtype, subtype_fpkm)
rm(project_file, project_name)

```
```{r}
# Transpose the dataset so that each row is a sample and each column is a
# feature (gene); the feature names become the column headers.
setnf5 <- t(setnf5)
```
```{r}
# Keep only features (columns) whose mean expression across all samples is at
# least `min_mean_expr`; features below the threshold are removed.
min_mean_expr <- 0.04
expressed <- which(colMeans(setnf5) >= min_mean_expr)

# drop = FALSE keeps a matrix even if a single feature passes the filter.
RNAset_filter <- setnf5[, expressed, drop = FALSE]
RNAgenes_filter <- colnames(RNAset_filter)

# Restrict each RNA-type gene list to the features that passed the filter.
othRNAgenes_filter <- intersect(othRNAgenes, RNAgenes_filter)
mRNAgenes_filter <- intersect(mRNAgenes, RNAgenes_filter)
miRNAgenes_filter <- intersect(miRNAgenes, RNAgenes_filter)
lncRNAgenes_filter <- intersect(lncRNAgenes, RNAgenes_filter)

# Partition dataset as per RNA types
mRNAset_filter <- setnf5[, mRNAgenes_filter, drop = FALSE]
miRNAset_filter <- setnf5[, miRNAgenes_filter, drop = FALSE]
lncRNAset_filter <- setnf5[, lncRNAgenes_filter, drop = FALSE]
othRNAset_filter <- setnf5[, othRNAgenes_filter, drop = FALSE]

# Report the dimensions (samples x features) of every partition.
print(c("RNA", dim(RNAset_filter)))
print(c("mRNA", dim(mRNAset_filter)))
print(c("miRNA", dim(miRNAset_filter)))
print(c("lncRNA", dim(lncRNAset_filter)))
print(c("othRNA", dim(othRNAset_filter)))

rm(min_mean_expr, expressed)

```

```{r}
# Write the labels and the log2(x + 1)-transformed expression tables to the
# output directory, creating it first so the writes do not fail on a fresh
# checkout.
out_dir <- "TCGA_BRCA_SubTypes"
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

RNAset_filter <- log(RNAset_filter + 1, base = 2)
write.table(
  labels5,
  file.path(out_dir, "labels.csv"),
  sep = ",",
  col.names = FALSE,
  row.names = FALSE
)
write.csv(RNAset_filter, file.path(out_dir, "dataL2.csv"))

mRNAset_filter <- log(mRNAset_filter + 1, base = 2)
miRNAset_filter <- log(miRNAset_filter + 1, base = 2)
lncRNAset_filter <- log(lncRNAset_filter + 1, base = 2)
othRNAset_filter <- log(othRNAset_filter + 1, base = 2)

write.csv(mRNAset_filter, file.path(out_dir, "dataL2-mRNA.csv"))
write.csv(miRNAset_filter, file.path(out_dir, "dataL2-miRNA.csv"))
write.csv(lncRNAset_filter, file.path(out_dir, "dataL2-lncRNA.csv"))
write.csv(othRNAset_filter, file.path(out_dir, "dataL2-othRNA.csv"))

# Clean up bookkeeping variables. Some of them may already have been removed
# by an earlier chunk, so only remove the ones that still exist to avoid
# "object not found" warnings.
stale <- c("keepT", "cnt", "project_files", "project_file", "projects")
rm(list = intersect(stale, ls()))
rm(out_dir, stale)

```
